# Global knitr chunk defaults: hide messages and warnings, evaluate every
# chunk, and do not cache results. TRUE/FALSE are spelled out because the
# T/F shorthands are ordinary variables that can be reassigned.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  eval = TRUE,
  cache = FALSE
)
For today’s workshop, we’re going to use R to go through a typical bioinformatics analysis workflow. We’re going to use common bioinformatics techniques to visualize data and make beautiful figures.
The data we will analyze is breast cancer RNA-Seq data from TCGA, a popular publicly available database for cancer-related datasets. The goal of the analysis will be to identify genes that show significant changes in expression between normal and tumor tissues, followed by identifying the pathways they are associated with. After importing the data and performing some data pre-processing, we will carry out differential expression analysis and gene set enrichment analysis.
Main steps in today’s workshop:
Make sure to have the following packages installed for this workshop:
Biobase, dplyr, DESeq2, fgsea, ggplot2, msigdbr
An expression set is a data object consisting of three entities: the expression matrix (exprs), the phenotype data (pData), and the feature data (fData).
We read in the RDS file included in this repo. It corresponds to a subset of samples from a gene expression dataset of breast cancer (BRCA) primary tissue samples from the TCGA project.
library(Biobase)
library(magrittr)
library(dplyr)
library(ggplot2)
library(Biobase)
library(ggfortify)
library(plotly)
# Load the expression set stored as an RDS file in this repository.
brca <- readRDS(file = "data/TCGA-BRCA.rds")
# Size of the expression data (features x samples)
dim(brca)
## Features Samples
## 36812 1222
# Size of the gene (feature) annotation table
brca %>% fData() %>% dim()
## [1] 36812 4
# Peek at the identifier columns of the gene annotation
fData(brca)[, c("ensembl_transcript_id", "ensembl_gene_id", "hgnc_symbol")] %>%
  head()
## ensembl_transcript_id ensembl_gene_id hgnc_symbol
## TSPAN6 ENSG00000000003.13 ENSG00000000003 TSPAN6
## TNMD ENSG00000000005.5 ENSG00000000005 TNMD
## DPM1 ENSG00000000419.11 ENSG00000000419 DPM1
## SCYL3 ENSG00000000457.12 ENSG00000000457 SCYL3
## C1orf112 ENSG00000000460.15 ENSG00000000460 C1orf112
## FGR ENSG00000000938.11 ENSG00000000938 FGR
# Size of the sample (phenotype) annotation table
brca %>% pData() %>% dim()
## [1] 1222 65
# Peek at a few of the phenotype columns
pData(brca)[, c("patient_id", "sample_type", "tumor_subtype")] %>%
  head()
## patient_id sample_type tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 Primary Tumor LumB
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY Primary Tumor LumA
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z Primary Tumor LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU Primary Tumor LumA
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN Primary Tumor Her2
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W Primary Tumor LumB
# Number of samples per sample type
table(pData(brca)[["sample_type"]])
##
## Metastatic Primary Tumor Solid Tissue Normal
## 7 1102 113
# Number of samples per tumor subtype
table(pData(brca)[["tumor_subtype"]])
##
## Basal Her2 LumA LumB Normal
## 169 209 510 198 16
# Put the expression values on a log2 scale; the +1 offset maps zeros to 0
# instead of -Inf.
exprs(brca) <- log2(1 + exprs(brca))
# Show the top-left 5 x 5 corner of the transformed matrix
exprs(brca)[seq_len(5), seq_len(5)]
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A2-A0SY-01A-31R-A084-07
## TSPAN6 13.975579 10.981567
## TNMD 1.584963 6.189825
## DPM1 11.156715 10.822571
## SCYL3 10.590587 10.946906
## C1orf112 9.519636 9.339850
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-D8-A1XU-01A-11R-A14M-07
## TSPAN6 12.302353 12.463013
## TNMD 4.459432 2.807355
## DPM1 11.945444 12.266494
## SCYL3 10.611025 11.149747
## C1orf112 9.388017 9.400879
## TCGA-A1-A0SN-01A-11R-A144-07
## TSPAN6 9.714246
## TNMD 1.000000
## DPM1 12.419960
## SCYL3 11.136350
## C1orf112 9.884171
Start by ranking genes by their variance across samples.
# Variance of each row of the expression matrix (one row per gene),
# ordered from most to least variable.
row.var <- exprs(brca) %>%
  apply(1, var) %>%
  sort(decreasing = TRUE)
head(row.var)
## CLEC3A SCGB2A2 CPB1 TFF1 SCGB1D2 KCNJ3
## 29.73892 25.49291 24.59669 21.00591 20.25785 19.56774
To save time, we’ll run PCA on the top 2500 most variable genes
# Restrict to the 2500 highest-variance genes and transpose so that samples
# are rows and genes are columns before handing the table to prcomp().
df <- data.frame(t(exprs(brca[names(row.var)[seq_len(2500)], ])))
pca <- prcomp(df)
# Variance explained by the first five principal components
pca.summary <- summary(pca)
pca.summary$importance[, seq_len(5)]
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 47.42767 40.65232 29.32491 23.58570 20.28785
## Proportion of Variance 0.14934 0.10972 0.05709 0.03693 0.02733
## Cumulative Proportion 0.14934 0.25906 0.31615 0.35309 0.38041
# Attach the subtype label to the PCA input table so autoplot() can use it
# to colour the samples.
df[["tumor_subtype"]] <- pData(brca)[["tumor_subtype"]]
autoplot(pca, data = df, colour = "tumor_subtype")
# Table of the first three PC scores plus the tumor subtype of each sample.
# The columns are assembled with data.frame() rather than cbind(): binding a
# numeric matrix to a character vector with cbind() yields a character
# matrix, so PC1-PC3 would reach plot_ly() as strings instead of numbers.
# With this construction the PC columns stay numeric and head() prints them
# as numbers rather than as full-precision strings.
df.pca <- data.frame(
  pca$x[, c("PC1", "PC2", "PC3")],
  tumor_subtype = brca$tumor_subtype,
  stringsAsFactors = FALSE
)
head(df.pca)
## PC1 PC2
## TCGA-A8-A085-01A-11R-A00Z-07 -89.9895594300379 14.4728151158205
## TCGA-A2-A0SY-01A-31R-A084-07 8.62013654000677 -39.5325958432916
## TCGA-AR-A24Z-01A-11R-A169-07 -41.8816734638114 -26.6304385642973
## TCGA-D8-A1XU-01A-11R-A14M-07 -22.8004029255938 -38.3033437021283
## TCGA-A1-A0SN-01A-11R-A144-07 -26.3108798601652 1.88579216258153
## TCGA-D8-A73W-01A-22R-A352-07 -46.7262990133077 6.82995550570086
## PC3 tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 -38.4190093513565 LumB
## TCGA-A2-A0SY-01A-31R-A084-07 7.54869728087252 LumA
## TCGA-AR-A24Z-01A-11R-A169-07 11.3252895436753 LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 24.3111755582573 LumA
## TCGA-A1-A0SN-01A-11R-A144-07 21.69949616053 Her2
## TCGA-D8-A73W-01A-22R-A352-07 -12.9151476451899 LumB
# Interactive 3D scatter of the first three PCs, one marker per sample,
# coloured by tumor subtype.
p <- plot_ly(
  data = df.pca,
  type = "scatter3d",
  mode = "markers",
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~tumor_subtype,
  marker = list(size = 3)
)
p
# Gene panel summarised by a single principal component below
# (labelled "Basal Genes" / "Important Genes" in the plot further down).
genes <- c(
  "FOXA1", "MLPH", "AR", "GATA3", "DNALI1", "FAM47E", "RHOB", "SPDEF",
  "SLC7A8", "TTC6", "CA12", "SMIM14", "C5AR2", "SIDT1", "NOSTRIN", "CCDC125",
  "FAM198B-AS1", "TBC1D9", "SLC44A4", "DYNLRB2", "THSD4", "FAM214A", "GTF2IP7",
  "SLC22A5", "CCDC170"
)
# PCA on the expression of the selected genes only (samples as rows);
# keep the matrix of per-sample PC scores.
pcs <- brca[genes, ] %>%
  exprs() %>%
  t() %>%
  data.frame() %>%
  prcomp() %>%
  .[["x"]]
# drop = FALSE keeps PC1 as a one-column matrix, so the column is still
# named "PC1" once it is bound to the phenotype table.
# Note: this reuses (overwrites) the name `df` from the earlier PCA step.
df <- cbind(pcs[, "PC1", drop = FALSE], pData(brca))
head(df)
## PC1 full_id
## TCGA-A8-A085-01A-11R-A00Z-07 -7.550408 TCGA-A8-A085-01A-11R-A00Z-07
## TCGA-A2-A0SY-01A-31R-A084-07 -6.881499 TCGA-A2-A0SY-01A-31R-A084-07
## TCGA-AR-A24Z-01A-11R-A169-07 -9.659289 TCGA-AR-A24Z-01A-11R-A169-07
## TCGA-D8-A1XU-01A-11R-A14M-07 -7.164274 TCGA-D8-A1XU-01A-11R-A14M-07
## TCGA-A1-A0SN-01A-11R-A144-07 -3.429342 TCGA-A1-A0SN-01A-11R-A144-07
## TCGA-D8-A73W-01A-22R-A352-07 -5.565336 TCGA-D8-A73W-01A-22R-A352-07
## patient_id sample_id
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 TCGA-A8-A085-01A
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY TCGA-A2-A0SY-01A
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z TCGA-AR-A24Z-01A
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU TCGA-D8-A1XU-01A
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN TCGA-A1-A0SN-01A
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W TCGA-D8-A73W-01A
## case_id
## TCGA-A8-A085-01A-11R-A00Z-07 3c08aabd-d5b5-4bbe-857c-38a7527b2163
## TCGA-A2-A0SY-01A-31R-A084-07 dc696e3c-f448-468f-a576-f4429be0338a
## TCGA-AR-A24Z-01A-11R-A169-07 9fefbe7c-f66a-4940-843e-285cb7b392c1
## TCGA-D8-A1XU-01A-11R-A14M-07 332148f5-f070-4c20-8eb1-4d8c0673aa52
## TCGA-A1-A0SN-01A-11R-A144-07 0dc337fa-da8b-42c4-b9a7-fb76d81c161f
## TCGA-D8-A73W-01A-22R-A352-07 ea8dbc7a-54c6-469c-865e-f49d00b0223d
## submitter_id project_id gender year_of_birth
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 TCGA-BRCA male 1964
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY TCGA-BRCA female 1945
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z TCGA-BRCA female 1949
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU TCGA-BRCA female 1954
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN TCGA-BRCA female 1957
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W TCGA-BRCA female 1934
## race ethnicity
## TCGA-A8-A085-01A-11R-A00Z-07 not reported not reported
## TCGA-A2-A0SY-01A-31R-A084-07 white not hispanic or latino
## TCGA-AR-A24Z-01A-11R-A169-07 white not reported
## TCGA-D8-A1XU-01A-11R-A14M-07 white not hispanic or latino
## TCGA-A1-A0SN-01A-11R-A144-07 white not hispanic or latino
## TCGA-D8-A73W-01A-22R-A352-07 white not hispanic or latino
## year_of_death classification_of_tumor
## TCGA-A8-A085-01A-11R-A00Z-07 -- not reported
## TCGA-A2-A0SY-01A-31R-A084-07 -- not reported
## TCGA-AR-A24Z-01A-11R-A169-07 -- not reported
## TCGA-D8-A1XU-01A-11R-A14M-07 -- not reported
## TCGA-A1-A0SN-01A-11R-A144-07 -- not reported
## TCGA-D8-A73W-01A-22R-A352-07 -- not reported
## last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07 not reported
## TCGA-A2-A0SY-01A-31R-A084-07 not reported
## TCGA-AR-A24Z-01A-11R-A169-07 not reported
## TCGA-D8-A1XU-01A-11R-A14M-07 not reported
## TCGA-A1-A0SN-01A-11R-A144-07 not reported
## TCGA-D8-A73W-01A-22R-A352-07 not reported
## primary_diagnosis tumor_stage
## TCGA-A8-A085-01A-11R-A00Z-07 Infiltrating duct carcinoma, NOS stage iib
## TCGA-A2-A0SY-01A-31R-A084-07 Lobular carcinoma, NOS stage iiia
## TCGA-AR-A24Z-01A-11R-A169-07 Infiltrating duct carcinoma, NOS stage iia
## TCGA-D8-A1XU-01A-11R-A14M-07 Infiltrating duct carcinoma, NOS stage ia
## TCGA-A1-A0SN-01A-11R-A144-07 Infiltrating duct carcinoma, NOS stage iia
## TCGA-D8-A73W-01A-22R-A352-07 Mucinous adenocarcinoma stage iiia
## age_at_diagnosis vital_status morphology
## TCGA-A8-A085-01A-11R-A00Z-07 16377 alive 8500/3
## TCGA-A2-A0SY-01A-31R-A084-07 22928 alive 8520/3
## TCGA-AR-A24Z-01A-11R-A169-07 20900 alive 8500/3
## TCGA-D8-A1XU-01A-11R-A14M-07 20715 alive 8500/3
## TCGA-A1-A0SN-01A-11R-A144-07 18401 alive 8500/3
## TCGA-D8-A73W-01A-22R-A352-07 29125 dead 8480/3
## days_to_death
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 385.0
## days_to_last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 --
## days_to_recurrence tumor_grade
## TCGA-A8-A085-01A-11R-A00Z-07 -- not reported
## TCGA-A2-A0SY-01A-31R-A084-07 -- not reported
## TCGA-AR-A24Z-01A-11R-A169-07 -- not reported
## TCGA-D8-A1XU-01A-11R-A14M-07 -- not reported
## TCGA-A1-A0SN-01A-11R-A144-07 -- not reported
## TCGA-D8-A73W-01A-22R-A352-07 -- not reported
## tissue_or_organ_of_origin days_to_birth
## TCGA-A8-A085-01A-11R-A00Z-07 Breast, NOS -16377.0
## TCGA-A2-A0SY-01A-31R-A084-07 Breast, NOS -22928.0
## TCGA-AR-A24Z-01A-11R-A169-07 Breast, NOS -20900.0
## TCGA-D8-A1XU-01A-11R-A14M-07 Breast, NOS -20715.0
## TCGA-A1-A0SN-01A-11R-A144-07 Breast, NOS -18401.0
## TCGA-D8-A73W-01A-22R-A352-07 Breast, NOS -29125.0
## progression_or_recurrence prior_malignancy
## TCGA-A8-A085-01A-11R-A00Z-07 not reported not reported
## TCGA-A2-A0SY-01A-31R-A084-07 not reported not reported
## TCGA-AR-A24Z-01A-11R-A169-07 not reported not reported
## TCGA-D8-A1XU-01A-11R-A14M-07 not reported not reported
## TCGA-A1-A0SN-01A-11R-A144-07 not reported not reported
## TCGA-D8-A73W-01A-22R-A352-07 not reported not reported
## site_of_resection_or_biopsy
## TCGA-A8-A085-01A-11R-A00Z-07 Breast, NOS
## TCGA-A2-A0SY-01A-31R-A084-07 Breast, NOS
## TCGA-AR-A24Z-01A-11R-A169-07 Breast, NOS
## TCGA-D8-A1XU-01A-11R-A14M-07 Breast, NOS
## TCGA-A1-A0SN-01A-11R-A144-07 Breast, NOS
## TCGA-D8-A73W-01A-22R-A352-07 Breast, NOS
## days_to_last_follow_up therapeutic_agents
## TCGA-A8-A085-01A-11R-A00Z-07 1124.0 --
## TCGA-A2-A0SY-01A-31R-A084-07 1347.0 --
## TCGA-AR-A24Z-01A-11R-A169-07 3001.0 --
## TCGA-D8-A1XU-01A-11R-A14M-07 395.0 --
## TCGA-A1-A0SN-01A-11R-A144-07 1196.0 --
## TCGA-D8-A73W-01A-22R-A352-07 244.0 --
## treatment_intent_type treatment_or_therapy
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## sample_submitter_id case_submitter_id
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085-01A TCGA-A8-A085
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY-01A TCGA-A2-A0SY
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z-01A TCGA-AR-A24Z
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU-01A TCGA-D8-A1XU
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN-01A TCGA-A1-A0SN
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W-01A TCGA-D8-A73W
## sample_type_id
## TCGA-A8-A085-01A-11R-A00Z-07 1
## TCGA-A2-A0SY-01A-31R-A084-07 1
## TCGA-AR-A24Z-01A-11R-A169-07 1
## TCGA-D8-A1XU-01A-11R-A14M-07 1
## TCGA-A1-A0SN-01A-11R-A144-07 1
## TCGA-D8-A73W-01A-22R-A352-07 1
## time_between_excision_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 --
## oct_embedded tumor_code_id
## TCGA-A8-A085-01A-11R-A00Z-07 false --
## TCGA-A2-A0SY-01A-31R-A084-07 true --
## TCGA-AR-A24Z-01A-11R-A169-07 true --
## TCGA-D8-A1XU-01A-11R-A14M-07 false --
## TCGA-A1-A0SN-01A-11R-A144-07 true --
## TCGA-D8-A73W-01A-22R-A352-07 false --
## intermediate_dimension is_ffpe
## TCGA-A8-A085-01A-11R-A00Z-07 -- False
## TCGA-A2-A0SY-01A-31R-A084-07 -- False
## TCGA-AR-A24Z-01A-11R-A169-07 -- False
## TCGA-D8-A1XU-01A-11R-A14M-07 -- False
## TCGA-A1-A0SN-01A-11R-A144-07 -- False
## TCGA-D8-A73W-01A-22R-A352-07 -- False
## pathology_report_uuid
## TCGA-A8-A085-01A-11R-A00Z-07 64F84FF4-A477-4E1E-B4BB-E5614517229E
## TCGA-A2-A0SY-01A-31R-A084-07 8E6902A6-A673-46CC-9AEB-3A71EF11099F
## TCGA-AR-A24Z-01A-11R-A169-07 AD07F611-0EEA-4890-A02C-6DA3F5F57C45
## TCGA-D8-A1XU-01A-11R-A14M-07 845F8FCF-CF3C-4CEF-B673-A57DE626939C
## TCGA-A1-A0SN-01A-11R-A144-07 D0269758-EFAE-4EBA-8CCF-4A6CF4D4B35A
## TCGA-D8-A73W-01A-22R-A352-07 359DB5F2-BD23-42E1-B316-9D908DBACD78
## tumor_descriptor sample_type
## TCGA-A8-A085-01A-11R-A00Z-07 -- Primary Tumor
## TCGA-A2-A0SY-01A-31R-A084-07 -- Primary Tumor
## TCGA-AR-A24Z-01A-11R-A169-07 -- Primary Tumor
## TCGA-D8-A1XU-01A-11R-A14M-07 -- Primary Tumor
## TCGA-A1-A0SN-01A-11R-A144-07 -- Primary Tumor
## TCGA-D8-A73W-01A-22R-A352-07 -- Primary Tumor
## distance_normal_to_tumor
## TCGA-A8-A085-01A-11R-A00Z-07 released
## TCGA-A2-A0SY-01A-31R-A084-07 released
## TCGA-AR-A24Z-01A-11R-A169-07 released
## TCGA-D8-A1XU-01A-11R-A14M-07 released
## TCGA-A1-A0SN-01A-11R-A144-07 released
## TCGA-D8-A73W-01A-22R-A352-07 released
## biospecimen_anatomic_site state
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## diagnosis_pathologically_confirmed
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 --
## current_weight composition
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## time_between_clamping_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 --
## distributor_reference shortest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## method_of_sample_procurement tumor_code
## TCGA-A8-A085-01A-11R-A00Z-07 -- 788
## TCGA-A2-A0SY-01A-31R-A084-07 -- 1083
## TCGA-AR-A24Z-01A-11R-A169-07 -- 1673
## TCGA-D8-A1XU-01A-11R-A14M-07 -- 102
## TCGA-A1-A0SN-01A-11R-A144-07 -- 1091
## TCGA-D8-A73W-01A-22R-A352-07 -- 191
## passage_count tissue_type
## TCGA-A8-A085-01A-11R-A00Z-07 130.0 --
## TCGA-A2-A0SY-01A-31R-A084-07 510.0 --
## TCGA-AR-A24Z-01A-11R-A169-07 120.0 --
## TCGA-D8-A1XU-01A-11R-A14M-07 210.0 --
## TCGA-A1-A0SN-01A-11R-A144-07 120.0 --
## TCGA-D8-A73W-01A-22R-A352-07 230.0 --
## biospecimen_laterality
## TCGA-A8-A085-01A-11R-A00Z-07 --
## TCGA-A2-A0SY-01A-31R-A084-07 --
## TCGA-AR-A24Z-01A-11R-A169-07 --
## TCGA-D8-A1XU-01A-11R-A14M-07 --
## TCGA-A1-A0SN-01A-11R-A144-07 --
## TCGA-D8-A73W-01A-22R-A352-07 --
## days_to_sample_procurement freezing_method
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## preservation_method growth_rate
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## days_to_collection catalog_reference
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## initial_weight longest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07 -- --
## TCGA-A2-A0SY-01A-31R-A084-07 -- --
## TCGA-AR-A24Z-01A-11R-A169-07 -- --
## TCGA-D8-A1XU-01A-11R-A14M-07 -- --
## TCGA-A1-A0SN-01A-11R-A144-07 -- --
## TCGA-D8-A73W-01A-22R-A352-07 -- --
## tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 LumB
## TCGA-A2-A0SY-01A-31R-A084-07 LumA
## TCGA-AR-A24Z-01A-11R-A169-07 LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 LumA
## TCGA-A1-A0SN-01A-11R-A144-07 Her2
## TCGA-D8-A73W-01A-22R-A352-07 LumB
# Mean PC1 score per tumor subtype; samples without a subtype label are
# dropped first so they do not form an NA group.
ex1 <- df %>%
  filter(!is.na(tumor_subtype)) %>%
  group_by(tumor_subtype) %>%
  summarize(PC1_mean = mean(PC1, na.rm = TRUE))
print(ex1)
## # A tibble: 5 x 2
## tumor_subtype PC1_mean
## <chr> <dbl>
## 1 Basal 16.8
## 2 Her2 -0.413
## 3 LumA -4.57
## 4 LumB -4.23
## 5 Normal 10.4
# One large point per tumor subtype showing its mean PC1 score; colour and
# shape both encode the subtype.
ggplot(ex1, aes(tumor_subtype, PC1_mean)) +
  geom_point(aes(color = tumor_subtype, shape = tumor_subtype), size = 10) +
  labs(
    x = "Tumor Subtype",
    y = "PC1 ~ Basal Genes",
    title = "Important Genes",
    subtitle = "Average PC1 Across Tumor Subtypes",
    caption = "Figure 1: Example of data wrangling into ggplot"
  )
Make a boxplot of PC1 across tumor subtypes.
Make a scatter plot of PC1 across tumor subtypes.
Make a boxplot of PC1 across tumor stages within each gender.
Make a density plot of PC1 across tumor stages within females.
Make a correlation plot of PC1 with age.